In [67]:
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import HTML
from EM import EM
from utils import gen_data, plot_single, animate_plot, compute_accuracy
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
In [68]:
## Example 1
np.random.seed(1111)
# Generate data from a GMM with 3 clusters and 2 dimensions
k = 3
p = 2
probs = [0.3, 0.5, 0.2]
X1, true_means1, true_covs1 = gen_data(k, p, 5000, probs, lim = [-50, 50])
title = r"GMM with $k=3$ components and $p=2$ dimensions"
plot_single(X1, k, title, true_param = (true_means1, true_covs1), filename = "Examples/2D case 1/original")
K-means initialization
In [69]:
mu_km1, sigma_km1, pi_km1, snapshots_km1, lls_km1 = EM(k, p, X1, 20, 1e-6, init_kmeans = True)
Initializing under K-Means...
0%| | 0/20 [00:00<?, ?it/s]
EM converged at iteration 2!
In [70]:
anim = animate_plot(snapshots_km1, X1, k, true_means1, true_covs1, True, "Examples/2D case 1/kmeans")
plt.close()
HTML(anim.to_jshtml())
Out[70]:
In [71]:
# Plot log-likelihood
plt.plot(range(1, len(lls_km1)), lls_km1[1:], marker='.')
plt.title("Log-likelihood (under K-Means initialization)")
plt.xlabel("Iteration")
plt.xticks(range(1, len(lls_km1)))
plt.show()
Random from data initialization
In [72]:
mu_rand1, sigma_rand1, pi_rand1, snapshots_rand1, lls_rand1 = EM(k, p, X1, 20, 1e-6, init_kmeans = False)
Initializing under random selection...
0%| | 0/20 [00:00<?, ?it/s]
In [73]:
anim = animate_plot(snapshots_rand1, X1, k, true_means1, true_covs1, False, "Examples/2D case 1/random")
plt.close()
HTML(anim.to_jshtml())
Out[73]:
In [74]:
# Plot log-likelihood
plt.plot(range(1, len(lls_rand1)), lls_rand1[1:], marker='.')
plt.title("Log-likelihood (under random from data initialization)")
plt.xlabel("Iteration")
plt.xticks(range(1, len(lls_rand1)))
plt.show()
To compute the 'accuracy' of our methods, we compare the MSE of the means and as the covariance matrices are multi-dimensional, we compute the trace and determinant, as well as the Frobenius norm.
In [75]:
results_km1 = compute_accuracy(true_means1, true_covs1, probs, mu_km1, sigma_km1, pi_km1)
results_rand1 = compute_accuracy(true_means1, true_covs1, probs, mu_rand1, sigma_rand1, pi_rand1)
print("Mean and Covariance evaluation")
print("-------------------------------------------------")
print("-------------------------------------------------")
print("EM estimates: K-Means initialization")
print("log-likelihood:", lls_km1[-1])
print("------------------------------")
print("K-means initialization")
print("Mean MSE:", results_km1[0])
print("Average |diff(det(Sigma))|:", results_km1[1][0])
print("Average |diff(tr(Sigma))|:", results_km1[1][1])
print("Average |diff(Sigma)|_F:", results_km1[1][1])
print("Average D_KL(C):", results_km1[2])
print("Average D_KL(pi):", results_km1[3])
print()
print("\n-------------------------------------------------")
print("EM estimates: random from data initialization")
print("log-likelihood:", lls_rand1[-1])
print("------------------------------")
print("Random from data initialization")
print("Mean MSE:", results_rand1[0])
print("Average |diff(det(Sigma))|:", results_rand1[1][0])
print("Average |diff(tr(Sigma))|:", results_rand1[1][1])
print("Average |diff(Sigma)|_F:", results_rand1[1][1])
print("Average D_KL(C):", results_rand1[2])
print("Average D_KL(pi):", results_rand1[3])
Mean and Covariance evaluation ------------------------------------------------- ------------------------------------------------- EM estimates: K-Means initialization log-likelihood: -23628.576273537983 ------------------------------ K-means initialization Mean MSE: 0.007218703784598276 Average |diff(det(Sigma))|: 0.12845194004352312 Average |diff(tr(Sigma))|: 0.1914839714002247 Average |diff(Sigma)|_F: 0.1914839714002247 Average D_KL(C): 0.0022516332237062925 Average D_KL(pi): 0.0001377910282519242 ------------------------------------------------- EM estimates: random from data initialization log-likelihood: -23628.576279563604 ------------------------------ Random from data initialization Mean MSE: 0.007219558904899555 Average |diff(det(Sigma))|: 0.1289346023465908 Average |diff(tr(Sigma))|: 0.1916174409758146 Average |diff(Sigma)|_F: 0.1916174409758146 Average D_KL(C): 0.0022504368444096192 Average D_KL(pi): 0.0001377947325556213
In [76]:
## Example 2
np.random.seed(1)
# Generate data from a GMM with 3 clusters and 2 dimensions
k = 3
p = 2
probs = [0.3, 0.5, 0.2]
X2, true_means2, true_covs2 = gen_data(k, p, 5000, probs, lim = [-50, 50])
title = r"GMM with $k=3$ components and $p=2$ dimensions"
plot_single(X2, k, title, true_param = (true_means2, true_covs2), filename = "Examples/2D case 2/original")
K-Means initialization
In [77]:
mu_km2, sigma_km2, pi_km2, snapshots_km2, lls_km2 = EM(k, p, X2, 20, 1e-6, init_kmeans = True)
Initializing under K-Means...
0%| | 0/20 [00:00<?, ?it/s]
EM converged at iteration 2!
In [78]:
anim = animate_plot(snapshots_km2, X2, k, true_means2, true_covs2, True, "Examples/2D case 2/kmeans")
plt.close()
HTML(anim.to_jshtml())
Out[78]:
In [79]:
# Plot log-likelihood
plt.plot(range(1, len(lls_km2)), lls_km2[1:], marker='.')
plt.title("Log-likelihood (under K-Means initialization)")
plt.xlabel("Iteration")
plt.xticks(range(1, len(lls_km2)))
plt.show()
Random from data initialization
In [80]:
mu_rand2, sigma_rand2, pi_rand2, snapshots_rand2, lls_rand2 = EM(k, p, X2, 20, 1e-6, init_kmeans = False)
Initializing under random selection...
0%| | 0/20 [00:00<?, ?it/s]
In [81]:
anim = animate_plot(snapshots_rand2, X2, k, true_means2, true_covs2, False, "Examples/2D case 2/random")
plt.close()
HTML(anim.to_jshtml())
Out[81]:
In [82]:
# Plot log-likelihood
plt.plot(range(1, len(lls_rand2)), lls_rand2[1:], marker='.')
plt.title("Log-likelihood (under random from data initialization)")
plt.xlabel("Iteration")
plt.xticks(range(1, len(lls_rand2)))
plt.show()
In [83]:
results_km2 = compute_accuracy(true_means2, true_covs2, probs, mu_km2, sigma_km2, pi_km2)
results_rand2 = compute_accuracy(true_means2, true_covs2, probs, mu_rand2, sigma_rand2, pi_rand2)
print("Mean and Covariance evaluation")
print("-------------------------------------------------")
print("-------------------------------------------------")
print("EM estimates: K-Means initialization")
print("log-likelihood:", lls_km2[-1])
print("------------------------------")
print("K-means initialization")
print("Mean MSE:", results_km2[0])
print("Average |diff(det(Sigma))|:", results_km2[1][0])
print("Average |diff(tr(Sigma))|:", results_km2[1][1])
print("Average |diff(Sigma)|_F:", results_km2[1][1])
print("Average D_KL(C):", results_km2[2])
print("Average D_KL(pi):", results_km2[3])
print()
print("\n-------------------------------------------------")
print("EM estimates: random from data initialization")
print("log-likelihood:", lls_rand2[-1])
print("------------------------------")
print("Random from data initialization")
print("Mean MSE:", results_rand2[0])
print("Average |diff(det(Sigma))|:", results_rand2[1][0])
print("Average |diff(tr(Sigma))|:", results_rand2[1][1])
print("Average |diff(Sigma)|_F:", results_rand2[1][1])
print("Average D_KL(C):", results_rand2[2])
print("Average D_KL(pi):", results_rand2[3])
Mean and Covariance evaluation ------------------------------------------------- ------------------------------------------------- EM estimates: K-Means initialization log-likelihood: -23450.980136778926 ------------------------------ K-means initialization Mean MSE: 0.006552332921552798 Average |diff(det(Sigma))|: 0.2397890736769246 Average |diff(tr(Sigma))|: 0.32335436017762004 Average |diff(Sigma)|_F: 0.32335436017762004 Average D_KL(C): 0.0028345964004957334 Average D_KL(pi): 0.0001815281871258602 ------------------------------------------------- EM estimates: random from data initialization log-likelihood: -30203.270937562604 ------------------------------ Random from data initialization Mean MSE: 1273.6151440536107 Average |diff(det(Sigma))|: 300.70408079259056 Average |diff(tr(Sigma))|: 46.24345240362512 Average |diff(Sigma)|_F: 46.24345240362512 Average D_KL(C): 107.16643320417587 Average D_KL(pi): 0.5910927614091862
In [84]:
## Example 3
np.random.seed(1234)
# Generate data from a GMM with 4 clusters and 3 dimensions
k = 4
p = 3
probs = [0.2, 0.3, 0.3, 0.2]
X3, true_means3, true_covs3 = gen_data(k, p, 5000, probs, lim = [-50, 50])
K-Means initialization
In [85]:
mu_km3, sigma_km3, pi_km3, snapshots_km3, lls_km3 = EM(k, p, X3, 50, 1e-6, init_kmeans = True)
Initializing under K-Means...
0%| | 0/50 [00:00<?, ?it/s]
EM converged at iteration 2!
In [86]:
# Plot log-likelihood
plt.plot(range(1, len(lls_km3)), lls_km3[1:], marker='.')
plt.title("Log-likelihood (under K-Means initialization)")
plt.xlabel("Iteration")
plt.xticks(range(1, len(lls_km3)))
plt.show()
Random from data initialization
In [87]:
mu_rand3, sigma_rand3, pi_rand3, snapshots_rand3, lls_rand3 = EM(k, p, X3, 50, 1e-6, init_kmeans = False)
Initializing under random selection...
0%| | 0/50 [00:00<?, ?it/s]
EM converged at iteration 20!
In [88]:
# Plot log-likelihood
plt.plot(range(1, len(lls_rand3)), lls_rand3[1:], marker='.')
plt.title("Log-likelihood (under random from data initialization)")
plt.xlabel("Iteration")
plt.xticks(range(1, len(lls_rand3)))
plt.show()
In [89]:
results_km3 = compute_accuracy(true_means3, true_covs3, probs, mu_km3, sigma_km3, pi_km3)
results_rand3 = compute_accuracy(true_means3, true_covs3, probs, mu_rand3, sigma_rand3, pi_rand3)
print("Mean and Covariance evaluation")
print("-------------------------------------------------")
print("-------------------------------------------------")
print("EM estimates: K-Means initialization")
print("log-likelihood:", lls_km3[-1])
print("------------------------------")
print("K-means initialization")
print("Mean MSE:", results_km3[0])
print("Average |diff(det(Sigma))|:", results_km3[1][0])
print("Average |diff(tr(Sigma))|:", results_km3[1][1])
print("Average |diff(Sigma)|_F:", results_km3[1][1])
print("Average D_KL(C):", results_km3[2])
print("Average D_KL(pi):", results_km3[3])
print()
print("\n-------------------------------------------------")
print("EM estimates: random from data initialization")
print("log-likelihood:", lls_rand3[-1])
print("------------------------------")
print("Random from data initialization")
print("Mean MSE:", results_rand3[0])
print("Average |diff(det(Sigma))|:", results_rand3[1][0])
print("Average |diff(tr(Sigma))|:", results_rand3[1][1])
print("Average |diff(Sigma)|_F:", results_rand3[1][1])
print("Average D_KL(C):", results_rand3[2])
print("Average D_KL(pi):", results_rand3[3])
Mean and Covariance evaluation ------------------------------------------------- ------------------------------------------------- EM estimates: K-Means initialization log-likelihood: -34025.391338430374 ------------------------------ K-means initialization Mean MSE: 0.006847396847139687 Average |diff(det(Sigma))|: 0.5806301860544005 Average |diff(tr(Sigma))|: 0.15421342641086389 Average |diff(Sigma)|_F: 0.15421342641086389 Average D_KL(C): 0.0027228113950643457 Average D_KL(pi): 0.0001983948147091484 ------------------------------------------------- EM estimates: random from data initialization log-likelihood: -34025.391338430374 ------------------------------ Random from data initialization Mean MSE: 0.006847396847139687 Average |diff(det(Sigma))|: 0.5806301860544005 Average |diff(tr(Sigma))|: 0.15421342641086389 Average |diff(Sigma)|_F: 0.15421342641086389 Average D_KL(C): 0.0027228113950643457 Average D_KL(pi): 0.0001983948147091484
Example 4¶
$k = 3$ clusters and $p=2$ dimensions
Here, we specify the clusters to be close to each other.
In [90]:
## Example 4
np.random.seed(1111)
# Generate data from a GMM with 3 clusters and 2 dimensions
k = 3
p = 2
probs = [0.3, 0.5, 0.2]
true_means4 = np.array([[0, 0], [7, 2], [-7, -2]])
true_covs4 = np.array([[[5, 4], [4, 5]], [[3, -2], [-2, 3]], [[3, -2], [-2, 4]]])
X4 = np.zeros((5000, 2))
for i in range(5000):
z = np.random.choice(k, p = probs)
X4[i] = np.random.multivariate_normal(true_means4[z], true_covs4[z])
title = r"GMM with $k=3$ components and $p=2$ dimensions"
plot_single(X4, k, title, true_param = (true_means4, true_covs4), filename = "Examples/2D case 4/original")
K-means initialization
In [91]:
mu_km4, sigma_km4, pi_km4, snapshots_km4, lls_km4 = EM(k, p, X4, 50, 1e-6, init_kmeans = True)
Initializing under K-Means...
0%| | 0/50 [00:00<?, ?it/s]
EM converged at iteration 46!
In [92]:
anim = animate_plot(snapshots_km4, X4, k, true_means4, true_covs4, True, "Examples/2D case 4/kmeans")
plt.close()
HTML(anim.to_jshtml())
Out[92]:
In [93]:
# Plot log-likelihood
plt.plot(range(1, len(lls_km4)), lls_km4[1:], marker='.')
plt.title("Log-likelihood (under K-Means initialization)")
plt.xlabel("Iteration")
plt.xticks(range(1, len(lls_km4)))
plt.show()
Random from data initialization
In [94]:
mu_rand4, sigma_rand4, pi_rand4, snapshots_rand4, lls_rand4 = EM(k, p, X4, 50, 1e-6, init_kmeans = False)
Initializing under random selection...
0%| | 0/50 [00:00<?, ?it/s]
In [95]:
anim = animate_plot(snapshots_rand4, X4, k, true_means4, true_covs4, False, "Examples/2D case 4/random")
plt.close()
HTML(anim.to_jshtml())
Out[95]:
In [96]:
# Plot log-likelihood
plt.plot(range(1, len(lls_rand4)), lls_rand4[1:], marker='.')
plt.title("Log-likelihood (under random from data initialization)")
plt.xlabel("Iteration")
plt.xticks(range(1, len(lls_rand4)))
plt.show()
In [97]:
results_km4 = compute_accuracy(true_means4, true_covs4, probs, mu_rand4, sigma_rand4, pi_rand4)
results_rand4 = compute_accuracy(true_means4, true_covs4, probs, mu_rand4, sigma_rand4, pi_rand4)
print("Mean and Covariance evaluation")
print("-------------------------------------------------")
print("-------------------------------------------------")
print("EM estimates: K-Means initialization")
print("log-likelihood:", lls_km4[-1])
print("------------------------------")
print("K-means initialization")
print("Mean MSE:", results_km4[0])
print("Average |diff(det(Sigma))|:", results_km4[1][0])
print("Average |diff(tr(Sigma))|:", results_km4[1][1])
print("Average |diff(Sigma)|_F:", results_km4[1][1])
print("Average D_KL(C):", results_km4[2])
print("Average D_KL(pi):", results_km4[3])
print()
print("\n-------------------------------------------------")
print("EM estimates: random from data initialization")
print("log-likelihood:", lls_rand4[-1])
print("------------------------------")
print("Random from data initialization")
print("Mean MSE:", results_rand4[0])
print("Average |diff(det(Sigma))|:", results_rand4[1][0])
print("Average |diff(tr(Sigma))|:", results_rand4[1][1])
print("Average |diff(Sigma)|_F:", results_rand4[1][1])
print("Average D_KL(C):", results_rand4[2])
print("Average D_KL(pi):", results_rand4[3])
Mean and Covariance evaluation ------------------------------------------------- ------------------------------------------------- EM estimates: K-Means initialization log-likelihood: -23598.238991284074 ------------------------------ K-means initialization Mean MSE: 0.01349834175106591 Average |diff(det(Sigma))|: 0.20576100757575114 Average |diff(tr(Sigma))|: 0.13814015175120353 Average |diff(Sigma)|_F: 0.13814015175120353 Average D_KL(C): 0.0027423538923110925 Average D_KL(pi): 6.122792618359907e-05 ------------------------------------------------- EM estimates: random from data initialization log-likelihood: -23598.23910720137 ------------------------------ Random from data initialization Mean MSE: 0.01349834175106591 Average |diff(det(Sigma))|: 0.20576100757575114 Average |diff(tr(Sigma))|: 0.13814015175120353 Average |diff(Sigma)|_F: 0.13814015175120353 Average D_KL(C): 0.0027423538923110925 Average D_KL(pi): 6.122792618359907e-05